# load dataset
train <- read.csv("./data/train.csv", header = TRUE, sep = ",")
source("./utils.r")

Preprocessing

Remove rows with missing values and normalize the predictors

# drop rows with any missing values
train <- train[complete.cases(train), ]
# standardize the predictor columns (the first column and the
# response in the last column are left as-is)
train[, 2:(ncol(train) - 1)] <- scale(train[, 2:(ncol(train) - 1)])

train.x <- as.matrix(train[, 2:(ncol(train) - 1)])  # glmnet requires a numeric matrix
train.y <- train[, ncol(train)]

Ridge regression

# ridge regression
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
# log-spaced grid of 100 lambda values from 1e5 down to 1e-5
num_lambdas <- 100
lambdas <- 10^seq(5, -5, length = num_lambdas)
# multinomial ridge fit (alpha = 0 selects the ridge penalty)
ridge <- glmnet(train.x, train.y, family = "multinomial", alpha = 0, lambda = lambdas)

Parameter path

par(mfrow = c(1, 1))
# plot the parameter path
plot(ridge, xvar = "lambda", label = TRUE)
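
The coefficients at any λ on the grid can also be inspected numerically. A small sketch: for a multinomial fit, coef() returns one sparse coefficient matrix per class (the λ value below is arbitrary, not a tuned choice).

# coefficients at one (arbitrary) lambda: a list with one sparse
# column matrix of intercept + slopes per quality class
coefs <- coef(ridge, s = 0.01)
str(coefs, max.level = 1)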

Test Error and Accuracy

  • Error is defined by the cross-entropy loss
    $-\frac{1}{n}\sum_{i=1}^{n}\left[\, y_i^T \log(p_i) + (1 - y_i)^T \log(1 - p_i) \right]$,
    where $y_i$ is the one-hot encoding of observation $i$'s class and $p_i$ is the vector of predicted class probabilities.
  • Accuracy is the number of correct predictions divided by the total number of predictions.
# cross-validate over the lambda grid; cv.multinomial is a helper
# sourced from utils.r and returns per-lambda error and accuracy
ridge.cv <- cv.multinomial(train.x, train.y, alpha = 0, lambda = lambdas)
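
cv.multinomial itself is not shown here. As a minimal sketch, assuming it follows the definitions above, the per-λ metrics on a validation fold could be computed as follows (the function and argument names are illustrative, not the actual helper):

# illustrative sketch of the per-fold metrics; the real implementation
# lives in utils.r and may differ
fold_metrics <- function(fit, x.val, y.val) {
    p <- predict(fit, newx = x.val, type = "response")  # n x K x n_lambda probabilities
    y.onehot <- model.matrix(~ factor(y.val) - 1)       # n x K one-hot labels
    error <- apply(p, 3, function(pk)
        -mean(rowSums(y.onehot * log(pk) + (1 - y.onehot) * log(1 - pk))))
    pred <- predict(fit, newx = x.val, type = "class")  # n x n_lambda class labels
    accuracy <- colMeans(pred == y.val)
    list(error = error, accuracy = accuracy)
}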

λ that minimizes cv error

lambdas[which.min(ridge.cv$error)]
## [1] 0.001321941

Minimum cv error

ridge.cv$error[which.min(ridge.cv$error)]
## [1] 0.2914767

Plot

Error as a function of λ

# plot cv error against lambda (log-scaled x axis)
plot(lambdas, ridge.cv$error, type = "l", xlab = "lambda", ylab = "CV error", main = "Ridge Regression Error", log = "x")

Accuracy as a function of λ

plot(lambdas, ridge.cv$accuracy, type = "l", xlab = "lambda", ylab = "CV accuracy", main = "Ridge Regression Accuracy", log = "x")

λ that maximizes accuracy

lambdas[which.max(ridge.cv$accuracy)]
## [1] 0.0008302176

Highest accuracy

max(ridge.cv$accuracy)
## [1] 0.5717073

KNN

library(class)
max_k <- 200
# leave-one-out cross-validation accuracy (class::knn.cv) for each k
knn.cv.accuracy <- rep(0, max_k)
for (k in 1:max_k) {
    knn.cv.accuracy[k] <- mean(train.y == knn.cv(train.x, train.y, k = k))
}
# smooth the accuracy curve with a centered moving average of width q
q <- 16
knn.cv.accuracy.smooth <- stats::filter(knn.cv.accuracy, rep(1/q, q), sides = 2)
# plot the knn cv accuracy
plot(1:max_k, knn.cv.accuracy, type = "l", xlab = "k", ylab = "CV accuracy", main = "KNN CV Accuracy")

# plot the smoothed knn cv accuracy
lines(1:max_k, knn.cv.accuracy.smooth, col = "red")

legend("bottomright", c("KNN CV Accuracy", "KNN CV Accuracy Smoothed"), col = c("black", "red"), lty = 1)

Best k from cv

k.cv.best <- which.max(knn.cv.accuracy.smooth)
k.cv.best
## [1] 171

Best cv accuracy (of the unsmoothed curve)

knn.cv.accuracy[which.max(knn.cv.accuracy)]
## [1] 0.5622568
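
With k selected, the model would be scored on held-out data with class::knn. A sketch, assuming a file ./data/test.csv with the same layout as train.csv (path and layout are assumptions; for a stricter comparison the test predictors would be centered and scaled with the training means and standard deviations rather than their own):

# sketch: score held-out data with the selected k
# (assumption: ./data/test.csv exists and mirrors train.csv's layout)
test <- read.csv("./data/test.csv", header = TRUE, sep = ",")
test <- test[complete.cases(test), ]
test[, 2:(ncol(test) - 1)] <- scale(test[, 2:(ncol(test) - 1)])
test.x <- as.matrix(test[, 2:(ncol(test) - 1)])
test.y <- test[, ncol(test)]
knn.pred <- knn(train.x, test.x, train.y, k = k.cv.best)
mean(knn.pred == test.y)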

Linear Discriminant Analysis (LDA)

# lda with leave-one-out cross-validation (CV = TRUE stores the
# cross-validated class predictions in lda$class)
library(MASS)

lda <- lda(quality ~ fixed.acidity + volatile.acidity + citric.acid +
    residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
    density + pH + sulphates + alcohol,
    data = train, CV = TRUE)

LDA accuracy

lda.accuracy <- mean(train.y == lda$class)
lda.accuracy
## [1] 0.5695525
# qda is not applicable here: some quality classes have too few
# observations to estimate a separate covariance matrix per class
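
The group sizes behind that remark can be checked directly; QDA estimates a separate covariance matrix per class, which requires each class to be comfortably larger than the number of predictors:

# class counts per quality level
table(train$quality)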